%pip install gensim==3.7
%pip install gensim==3.7
import matplotlib.pyplot as plt
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
def get_embed_df(words):
    """Build a Spark DataFrame of word embeddings plus a 2-component PCA projection.

    Each word is looked up in the module-level ``word_vectors`` object and its
    components are converted with ``.item()`` before being packed into a dense
    Spark ML vector.

    Args:
        words: Words to look up in ``word_vectors``.

    Returns:
        A DataFrame with columns ``id`` (position of the word in ``words``),
        ``word``, ``vectors`` (the dense embedding) and ``2d_vectors`` (the
        PCA output with ``k=2``).
    """
    # One dense vector per word
    dense_vecs = [
        Vectors.dense([component.item() for component in word_vectors[word]])
        for word in words
    ]
    rows = [(idx, *pair) for idx, pair in enumerate(zip(words, dense_vecs))]
    embeddings = spark.createDataFrame(rows, ["id", "word", "vectors"])

    # Project down to 2 dimensions so the vectors can be plotted
    reducer = PCA(k=2, inputCol="vectors", outputCol="2d_vectors")
    return reducer.fit(embeddings).transform(embeddings)
# Words to embed and project to 2-D
words = ["man", "woman", "king", "queen", "object"]
embed_df = get_embed_df(words)

# Collect the projected column and keep only the first 4 vectors of interest
projected_rows = embed_df.select("2d_vectors").collect()
vectors = [row[0] for row in projected_rows[:4]]
# Plot
def plot_vectors(vectors, words, title="Visualizing Word Embeddings", xlim1=-2, xlim2=5, ylim1=-.5, ylim2=4.5):
    """Draw each vector as a labelled arrow from the origin using pyplot.

    Args:
        vectors: Sequence of indexable items; elements 0 and 1 of each item
            are used as the arrow's x and y components.
        words: Labels paired with ``vectors`` via ``zip`` (extra items on
            either side are ignored).
        title: Title passed to ``plt.title``.
        xlim1, xlim2: Lower and upper x-axis limits.
        ylim1, ylim2: Lower and upper y-axis limits.
    """
    for tip, label in zip(vectors, words):
        tip_x, tip_y = tip[0], tip[1]
        plt.quiver(0, 0, tip_x, tip_y, angles="xy", scale_units="xy", scale=1)
        # Nudge the label slightly right of the arrow head
        plt.text(tip_x + 0.05, tip_y, label)

    plt.title(title)
    plt.xlim(xlim1, xlim2)
    plt.ylim(ylim1, ylim2)
# Draw the word vectors, pass the result of plt.show() to display(),
# then clear the current figure so the next plot starts empty
plot_vectors(vectors, words)
shown = plt.show()
display(shown)
current_figure = plt.gcf()
current_figure.clear()
# TODO
# Redraw the 4 vectors from the graph above under a new title
plot_vectors(vectors, words, "Visualizing 'woman+king-man'")

# Woman + king
woman_vec = vectors[words.index("woman")]
king_vec = vectors[words.index("king")]
w_plus_k = woman_vec + king_vec

# Woman + king - man
man_vec = vectors[words.index("man")]
w_plus_k_minus_m = w_plus_k - man_vec

# Keyword arguments shared by both arrows
arrow_style = dict(angles='xy', scale_units='xy', scale=1)

# w_plus_k: blue arrow from the origin, with its label
plt.quiver(0, 0, w_plus_k[0], w_plus_k[1], color="blue", **arrow_style)
plt.text(w_plus_k[0] + 0.1, w_plus_k[1], "woman+king")

# w_plus_k_minus_m: red arrow starting at the tip of w_plus_k and ending at
# the tip of w_plus_k_minus_m, with its label
delta_x = w_plus_k_minus_m[0] - w_plus_k[0]
delta_y = w_plus_k_minus_m[1] - w_plus_k[1]
plt.quiver(w_plus_k[0], w_plus_k[1], delta_x, delta_y, color="red", **arrow_style)
plt.text(w_plus_k_minus_m[0] + 0.1, w_plus_k_minus_m[1] + 0.05, "(woman+king)-man")

display(plt.show())
plt.gcf().clear()
# TODO
import numpy as np
def cos_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as ``np.dot(v1, v2)`` divided by the product of the two
    ``np.linalg.norm`` values.

    Args:
        v1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        v2: Second vector, compatible with ``v1`` for ``np.dot``.

    Returns:
        The dot product divided by the product of the norms.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product
# Check our cosine similarity for one word pair against Gensim's own value
word1, word2 = "queen", "king"
ans = cos_similarity(word_vectors[word1], word_vectors[word2])
print(ans)

# Compare to 3 decimal places
gensim_ans = word_vectors.similarity(word1, word2)
assert round(ans, 3) == round(gensim_ans, 3), "Your answer does not match Gensim's"
# Last refresh: Never